This project explores crime in New York City during the current year. The dataset used in this project is available at this link, which is provided by the New York Police Department (NYPD).
if (!require('ggplot2')) install.packages('ggplot2')
if (!require('gridExtra')) install.packages('gridExtra')
if (!require('dplyr')) install.packages('dplyr')
if (!require('leaflet')) install.packages('leaflet')
if (!require('scales')) install.packages('scales')
if (!require('readr')) install.packages('readr')
if (!require('dplyr')) install.packages('dplyr')
if (!require('ggmap')) install.packages('ggmap')
if (!require('RgoogleMaps')) install.packages('RgoogleMaps')
if (!require('tigris')) install.packages('tigris')
if(!require('leaflet')) install.packages('leaflet')
library(ggmap)
library(maptools)
library(broom)
library(httr)
library(rgdal)
library(scales)
if(!require(readr)) install.packages("readr")
if(!require(dplyr)) install.packages("dplyr")
if(!require(DT)) install.packages("DT")
if(!require(ggrepel)) install.packages("ggrepel")
if(!require(leaflet)) install.packages("leaflet")
Load the data using readr and read_csv(). # Importing data
# Import data
# First extract: read with base read.csv(); empty strings and "NA" become NA.
NYcrime <- read.csv(
  "NYPD_Complaint_Data_2017.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  na.strings = c("", "NA")
)
head(NYcrime)

# new Data analysis
# Second extract: read with read_csv() from a machine-specific absolute path.
path <- "C:\\Users\\patel\\Desktop\\SPS\\SPS_DATA_607\\final_project\\NYPD_Complaint_Data_Current__Year_To_Date_.csv"
df <- read_csv(path)

# Preview subset: first 100 rows, with the complaint time shown as text.
df_sub <- df[1:100, ] # display the first 100 rows
df_sub[["CMPLNT_FR_TM"]] <- as.character(df_sub[["CMPLNT_FR_TM"]])
head(df_sub)

sprintf("Number of Rows in Dataframe: %s", format(nrow(df), big.mark = ","))
## [1] "Number of Rows in Dataframe: 228,905"
### Preprocess Data
#The All-Caps text is difficult to read. Let's force the text in the appropriate columns into proper case.
# Convert every run of two or more capital letters in `x` to proper case:
# the first letter stays upper-case and the remaining letters are lower-cased.
# Single capital letters and text that is not all-caps are left untouched.
# `x` is a character vector; the result has the same length.
proper_case <- function(x) {
  caps_word <- "\\b([A-Z])([A-Z]+)"
  cased_word <- "\\U\\1\\L\\2"
  gsub(caps_word, cased_word, x, perl = TRUE)
}
library(dplyr)
# Apply proper_case() to the all-caps descriptive columns in one pass and
# store the complaint time as character.
df <- df %>%
  mutate(
    across(
      c(
        BORO_NM, JURIS_DESC, LAW_CAT_CD, LOC_OF_OCCUR_DESC, OFNS_DESC,
        PARKS_NM, PATROL_BORO, PD_DESC, PREM_TYP_DESC
      ),
      proper_case
    ),
    CMPLNT_FR_TM = as.character(CMPLNT_FR_TM)
  )
df_sub <- df[1:100, ] # display the first 100 rows
head(df_sub)
Display crime incident locations on the map using leaflet. Click icons on the map to show incident details.
library(leaflet)
# Limit the map to the first 20,000 rows of `df`.
data <- head(df, 20000)

# `df` has no day-of-week column, so derive one from the complaint date
# (month/day/year text, the same format used for as.Date() later in this file).
data$DayOfWeek <- weekdays(as.Date(data$CMPLNT_FR_DT, format = "%m/%d/%Y"))

# HTML shown when a marker is clicked; one string per row of `data`.
data$popup <- paste("<b>Incident #: </b>", data$CMPLNT_NUM, "<br>", "<b>Category: </b>", data$LAW_CAT_CD,
                    "<br>", "<b>Offence Description: </b>", data$OFNS_DESC,
                    "<br>", "<b>Day of week: </b>", data$DayOfWeek,
                    "<br>", "<b>Date: </b>", data$CMPLNT_FR_DT,
                    "<br>", "<b>Time: </b>", data$CMPLNT_FR_TM,
                    "<br>", "<b>PD Case: </b>", data$PD_CD,
                    "<br>", "<b>PD Description: </b>", data$PD_DESC,
                    "<br>", "<b>Longitude: </b>", data$Longitude,
                    "<br>", "<b>Latitude: </b>", data$Latitude)

# A single OSM base layer (the original added the same tiles twice). The layer
# control lists only groups that are actually added; if a provider layer below
# is re-enabled, add its group name to `baseGroups` as well.
leaflet(data, width = "100%") %>%
  addTiles(group = "OSM (default)") %>%
  #addProviderTiles(provider = "Esri.WorldStreetMap",group = "World StreetMap") %>%
  #addProviderTiles(provider = "Esri.WorldImagery",group = "World Imagery") %>%
  # addProviderTiles(provider = "NASAGIBS.ViirsEarthAtNight2012",group = "Nighttime Imagery") %>%
  addMarkers(lng = ~Longitude, lat = ~Latitude, popup = ~popup, clusterOptions = markerClusterOptions()) %>%
  addLayersControl(
    baseGroups = c("OSM (default)"),
    options = layersControlOptions(collapsed = FALSE)
  )
## Warning in validateCoords(lng, lat, funcName): Data contains 1 rows with
## either missing or invalid lat/lon values and will be ignored
Summarize the data by incident category.
# Count records per LAW_CAT_CD value, most frequent first, keep values with
# more than 5000 records, and add each value's percentage of the kept total.
df_category <- table(df$LAW_CAT_CD)
df_category <- sort(df_category, decreasing = TRUE)
df_category <- data.frame(df_category[df_category > 5000])
names(df_category) <- c("Category", "Frequency")
df_category$Percentage <- with(df_category, Frequency / sum(Frequency) * 100)
df_category
Create a bar plot based on the incident category.
library(ggplot2)
library(ggrepel)
# One bar per category; x-axis tick labels are hidden and each bar is labelled
# with its category name via ggrepel instead.
bp <- ggplot(df_category, aes(x = Category, y = Frequency, fill = Category))
bp <- bp +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank()) +
  geom_text_repel(aes(label = Category))
bp
Summarize the data by offense description.
# Count records per OFNS_DESC value, most frequent first, keep values with
# more than 3000 records, and add each value's percentage of the kept total.
df_OFNS_DESC <- table(df$OFNS_DESC)
df_OFNS_DESC <- sort(df_OFNS_DESC, decreasing = TRUE)
df_OFNS_DESC <- data.frame(df_OFNS_DESC[df_OFNS_DESC > 3000])
names(df_OFNS_DESC) <- c("Category", "Frequency")
df_OFNS_DESC$Percentage <- with(df_OFNS_DESC, Frequency / sum(Frequency) * 100)
df_OFNS_DESC
Create a bar plot based on the offense description.
library(ggplot2)
library(ggrepel)
# One bar per offense description; x-axis tick labels are hidden and each bar
# is labelled with its description via ggrepel instead.
ofns_cat <- ggplot(df_OFNS_DESC, aes(x = Category, y = Frequency, fill = Category))
ofns_cat <- ofns_cat +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank()) +
  geom_text_repel(aes(label = Category))
ofns_cat
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(stringr)
# Copy of `df` with the complaint date parsed (month/day/year) and a labelled
# day-of-week column derived from it.
#data_dayOfWeek$day_of_week <- mdy(NYcrime_recent$CMPLNT_FR_DT)
data_dayOfWeek <- df %>%
  mutate(
    CMPLNT_FR_DT = as.Date(CMPLNT_FR_DT, format = "%m/%d/%Y"),
    day_of_week = wday(CMPLNT_FR_DT, label = TRUE)
  )
head(data_dayOfWeek)
Aggregate counts of reported crimes by day of week and hour to create a heat map. The day of week was derived above; the hour has to be extracted from the complaint time string.
# Extract the hour from "HH:MM[:SS]" time strings.
#
# x: vector of times; it is coerced with as.character() first.
# Returns a numeric vector the same length as `x` holding the text before the
# first ":" converted to a number (NA where that text is missing or not
# numeric).
#
# The previous version parsed only the first element of `x` and silently
# ignored the rest; this one is vectorized and gives the same result for a
# single string, so existing sapply(..., get_hour) calls keep working.
get_hour <- function(x) {
  as.numeric(sub(":.*$", "", as.character(x)))
}
# Count complaints per (day of week, hour).
# vapply() pins the hour to one numeric per row (sapply() changes its return
# type on empty input), and ungroup() stops the leftover day_of_week grouping
# from leaking into later steps.
df_crime_time <- data_dayOfWeek %>%
  mutate(Hour = vapply(CMPLNT_FR_TM, get_hour, numeric(1))) %>%
  group_by(day_of_week, Hour) %>%
  summarize(count = n()) %>%
  ungroup()
# df_theft_time %>% head(10)
datatable(df_crime_time, options = list(scrollX = "400px"))
# Reorder and format factors: days reversed so Sunday ends up at the top of
# the y-axis, hours 0-23 relabelled as 12 AM ... 11 PM.
dow_format <- c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
hour_format <- c(paste(c(12, 1:11), "AM"), paste(c(12, 1:11), "PM"))
# `levels`/`labels` are spelled out instead of relying on partial matching.
df_crime_time$day_of_week <- factor(df_crime_time$day_of_week, levels = rev(dow_format))
df_crime_time$Hour <- factor(df_crime_time$Hour, levels = 0:23, labels = hour_format)
# df_theft_time %>% head(10)
head(df_crime_time)
Create Time Heatmap
# Heat map of complaint counts: hour on the x-axis, day of week on the y-axis.
# `legend.margin = unit(...)` and `panel.margin` are deprecated theme
# arguments; `legend.spacing` / `panel.spacing` are their replacements.
plot <- ggplot(df_crime_time, aes(x = Hour, y = day_of_week, fill = count)) +
  geom_tile() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.6),
    legend.title = element_blank(),
    legend.position = "top",
    legend.direction = "horizontal",
    legend.key.width = unit(2, "cm"),
    legend.key.height = unit(0.25, "cm"),
    legend.spacing = unit(-0.5, "cm"),
    panel.spacing = unit(0, "cm")
  ) +
  labs(x = "Hour of crime (Local Time)", y = "Day of Week", title = "Number of crime in Crime reported by Time") +
  scale_fill_gradient(low = "white", high = "#FF0000", labels = comma)
plot
# Felony complaints only: count by (day of week, hour) and draw the heat map.
data_Felony <- filter(data_dayOfWeek, LAW_CAT_CD == "Felony")
head(data_Felony)
# vapply() pins the hour to one numeric per row; ungroup() drops the grouping
# that summarize() would otherwise leave on day_of_week.
df_Felony_time <- data_Felony %>%
  mutate(Hour = vapply(CMPLNT_FR_TM, get_hour, numeric(1))) %>%
  group_by(day_of_week, Hour) %>%
  summarize(count = n()) %>%
  ungroup()
# df_theft_time %>% head(10)
head(df_Felony_time)
# Reorder and format factors (argument names spelled out in full).
dow_format <- c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
hour_format <- c(paste(c(12, 1:11), "AM"), paste(c(12, 1:11), "PM"))
df_Felony_time$day_of_week <- factor(df_Felony_time$day_of_week, levels = rev(dow_format))
df_Felony_time$Hour <- factor(df_Felony_time$Hour, levels = 0:23, labels = hour_format)
head(df_Felony_time)
# `legend.spacing` / `panel.spacing` replace the deprecated
# `legend.margin = unit(...)` / `panel.margin` theme arguments.
felony_plot <- ggplot(df_Felony_time, aes(x = Hour, y = day_of_week, fill = count)) +
  geom_tile() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.6),
    legend.title = element_blank(),
    legend.position = "top",
    legend.direction = "horizontal",
    legend.key.width = unit(2, "cm"),
    legend.key.height = unit(0.25, "cm"),
    legend.spacing = unit(-0.5, "cm"),
    panel.spacing = unit(0, "cm")
  ) +
  labs(x = "Hour of Felony (Local Time)", y = "Day of Week", title = "Number of Felony reported by Time") +
  scale_fill_gradient(low = "white", high = "#FF0000", labels = comma)
felony_plot
# Complaints with a known time, plus factor versions of the hour, day of week
# and offense description.
df_heat_time <- data_dayOfWeek %>%
  filter(!is.na(CMPLNT_FR_TM))
# Take the text before the first ":" and convert it to an integer. The factor
# levels below are 0:23, so leaving the hour as text would turn any value that
# is not exactly "0".."23" (for example a zero-padded "09") into NA.
df_heat_time$Hour <- as.integer(sub(":.*$", "", as.character(df_heat_time$CMPLNT_FR_TM)))
hour_format <- c(paste(c(12, 1:11), "AM"), paste(c(12, 1:11), "PM"))
df_heat_time$Hour <- factor(df_heat_time$Hour, levels = 0:23, labels = hour_format)
df_heat_time$day_of_week <- as.factor(df_heat_time$day_of_week)
df_heat_time$OFNS_DESC <- as.factor(df_heat_time$OFNS_DESC)
# Misdemeanor complaints only: count by (day of week, hour) and draw the heat
# map.
data_Misdemeanor <- filter(data_dayOfWeek, LAW_CAT_CD == "Misdemeanor")
head(data_Misdemeanor)
# vapply() pins the hour to one numeric per row; ungroup() drops the grouping
# that summarize() would otherwise leave on day_of_week.
df_Misdemeanor_time <- data_Misdemeanor %>%
  mutate(Hour = vapply(CMPLNT_FR_TM, get_hour, numeric(1))) %>%
  group_by(day_of_week, Hour) %>%
  summarize(count = n()) %>%
  ungroup()
# df_theft_time %>% head(10)
# Reorder and format factors (argument names spelled out in full).
dow_format <- c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
hour_format <- c(paste(c(12, 1:11), "AM"), paste(c(12, 1:11), "PM"))
df_Misdemeanor_time$day_of_week <- factor(df_Misdemeanor_time$day_of_week, levels = rev(dow_format))
df_Misdemeanor_time$Hour <- factor(df_Misdemeanor_time$Hour, levels = 0:23, labels = hour_format)
# df_theft_time %>% head(10)
head(df_Misdemeanor_time)
# `legend.spacing` / `panel.spacing` replace the deprecated
# `legend.margin = unit(...)` / `panel.margin` theme arguments.
Misdemeanor_time_plot <- ggplot(df_Misdemeanor_time, aes(x = Hour, y = day_of_week, fill = count)) +
  geom_tile() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.6),
    legend.title = element_blank(),
    legend.position = "top",
    legend.direction = "horizontal",
    legend.key.width = unit(2, "cm"),
    legend.key.height = unit(0.25, "cm"),
    legend.spacing = unit(-0.5, "cm"),
    panel.spacing = unit(0, "cm")
  ) +
  labs(x = "Hour of Misdemeanor (Local Time)", y = "Day of Week", title = "Number of Misdemeanor reported by Time") +
  scale_fill_gradient(low = "white", high = "#FF0000", labels = comma)
Misdemeanor_time_plot
# Violation complaints only: count by (day of week, hour) and draw the heat
# map.
data_Violation <- filter(data_dayOfWeek, LAW_CAT_CD == "Violation")
head(data_Violation)
# vapply() pins the hour to one numeric per row; ungroup() drops the grouping
# that summarize() would otherwise leave on day_of_week.
df_Violation_time <- data_Violation %>%
  mutate(Hour = vapply(CMPLNT_FR_TM, get_hour, numeric(1))) %>%
  group_by(day_of_week, Hour) %>%
  summarize(count = n()) %>%
  ungroup()
# df_theft_time %>% head(10)
head(df_Violation_time)
# Reorder and format factors (argument names spelled out in full).
dow_format <- c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
hour_format <- c(paste(c(12, 1:11), "AM"), paste(c(12, 1:11), "PM"))
df_Violation_time$day_of_week <- factor(df_Violation_time$day_of_week, levels = rev(dow_format))
df_Violation_time$Hour <- factor(df_Violation_time$Hour, levels = 0:23, labels = hour_format)
# df_theft_time %>% head(10)
head(df_Violation_time)
# `legend.spacing` / `panel.spacing` replace the deprecated
# `legend.margin = unit(...)` / `panel.margin` theme arguments.
Violation_time_plot <- ggplot(df_Violation_time, aes(x = Hour, y = day_of_week, fill = count)) +
  geom_tile() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.6),
    legend.title = element_blank(),
    legend.position = "top",
    legend.direction = "horizontal",
    legend.key.width = unit(2, "cm"),
    legend.key.height = unit(0.25, "cm"),
    legend.spacing = unit(-0.5, "cm"),
    panel.spacing = unit(0, "cm")
  ) +
  labs(x = "Hour of Violation (Local Time)", y = "Day of Week", title = "Number of Violation reported by Time") +
  scale_fill_gradient(low = "white", high = "#FF0000", labels = comma)
Violation_time_plot
If crime is tied to activities, the time at which those activities end may affect when crimes occur.
# Count complaints per (month, day of week, hour) and compute `norm`, each
# cell's share of its month's total.
# vapply() pins the hour to one numeric per row; the final ungroup() drops the
# Month grouping once the per-month share has been computed.
df_report_time_month <- data_dayOfWeek %>%
  mutate(
    Month = format(as.Date(CMPLNT_FR_DT, "%m/%d/%Y"), "%B"),
    Hour = vapply(CMPLNT_FR_TM, get_hour, numeric(1))
  ) %>%
  group_by(Month, day_of_week, Hour) %>%
  summarize(count = n()) %>%
  group_by(Month) %>%
  mutate(norm = count / sum(count)) %>%
  ungroup()
head(df_report_time_month)
# `dow_format` and `hour_format` are the vectors defined for the earlier heat
# maps; argument names are spelled out instead of relying on partial matching.
df_report_time_month$day_of_week <- factor(df_report_time_month$day_of_week, levels = rev(dow_format))
df_report_time_month$Hour <- factor(df_report_time_month$Hour, levels = 0:23, labels = hour_format)
# Set order of month facets by chronological order instead of alphabetical
df_report_time_month$Month <- factor(df_report_time_month$Month, levels = c("January","February","March","April","May","June","July","August","September","October","November","December"))
# Fill by `norm`: the title promises month-normalized values, and `norm` was
# computed for that purpose, but the plot previously filled by raw `count`.
plot <- ggplot(df_report_time_month, aes(x = Hour, y = day_of_week, fill = norm)) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6, size = 4)) +
  labs(x = "Hour of Arrest (Local Time)", y = "Day of Week", title = "Reported Crime 2018 by Time and, Normalized by Month") +
  scale_fill_gradient(low = "White", high = "#FF0000") +
  facet_wrap(~ Month, nrow = 6)
plot
# Drop records with no borough, then draw a horizontal bar chart of record
# counts per borough (one colour per borough, legend hidden).
NYcrime <- filter(NYcrime, !is.na(BORO_NM))
borobp <- ggplot(NYcrime, aes(x = BORO_NM, fill = as.factor(BORO_NM)))
borobp <- borobp +
  geom_bar(width = 0.9, stat = "count") +
  theme(legend.position = "none") +
  coord_flip()
borobp
# Record counts per borough, rescaled by borough population.
boro.totals <- data.frame(table(NYcrime$BORO_NM))
names(boro.totals)[1] <- "Borough"
boro.totals
# NYC.gov has 2017 estimates at: 1471160, 2648771, 1664727, 2358582, and 479458 for BX, BK, MH, QN, and SI respectively.
boropops <- c(1471160, 2648771, 1664727, 2358582, 479458)
# Populations are matched to rows by position (table() orders boroughs
# alphabetically). Fail loudly if the number of boroughs differs, because the
# division below would otherwise recycle `boropops` and give wrong rates.
stopifnot(nrow(boro.totals) == length(boropops))
# Records per 100 residents.
boro.totals[, "Freq"] <- ((boro.totals[, "Freq"] / boropops) * 100)
# Refer to the column by bare name inside aes(); `boro.totals$Borough` there
# bypasses the plot's data and triggers a ggplot2 warning.
scaled.boro.bp <- ggplot(boro.totals, aes(x = Borough, y = Freq, fill = as.factor(Borough))) +
  geom_bar(width = 0.9, stat = "identity") +
  ggtitle("Crime Records per Capita by Borough") +
  theme(legend.position = "none") +
  coord_flip()
scaled.boro.bp
library(lubridate)
library(stringr)
#NYcrime_recent$CMPLNT_FR_DT <- as.Date(NYcrime_recent$CMPLNT_FR_DT,format = "%m/%d/%Y")
#NYcrime_recent <- NYcrime_recent %>%
# filter(!is.na(CMPLNT_FR_DT))
#NYcrime_recent$day_of_week <- mdy(NYcrime_recent$CMPLNT_FR_DT)
#NYcrime_recent$day_of_week<- wday(NYcrime_recent$CMPLNT_FR_DT, label=TRUE)
#head(NYcrime_recent)
# Treat the offense description as a factor, then keep only the mapping
# columns for the ten offense descriptions listed below.
NYcrime$OFNS_DESC <- as.factor(NYcrime$OFNS_DESC)
NYcrime_map <- NYcrime %>%
  select(
    CMPLNT_NUM, BORO_NM, CMPLNT_FR_DT, LAW_CAT_CD, OFNS_DESC,
    VIC_RACE, VIC_SEX, Latitude, Longitude
  ) %>%
  filter(
    OFNS_DESC %in% c(
      "GRAND LARCENY",
      "PETIT LARCENY",
      "HARRASSMENT 2",
      "CRIMINAL MISCHIEF & RELATED OF",
      "OFF. AGNST PUB ORD SENSBLTY &",
      "THEFT-FRAUDS",
      "SEX CRIMES",
      "ASSAULT 3 & RELATED OFFENSES",
      "MISCELLANEOUS PENAL LAW",
      "FRAUDS"
    )
  )
NYcrime_map
# Queens subset of the records above, grouped by offense description.
Queens_map <- group_by(
  filter(NYcrime_map, BORO_NM == "QUEENS"),
  OFNS_DESC
)
Queens_map
# Queens rows of the year-to-date data (proper-cased borough name), then the
# offense descriptions with more than 2000 records there, most frequent
# first, with each one's percentage of the kept total.
data_queens <- data_dayOfWeek %>%
  filter(BORO_NM == "Queens")
head(data_queens)
queens_OFNS_DESC <- table(data_queens$OFNS_DESC)
queens_OFNS_DESC <- sort(queens_OFNS_DESC, decreasing = TRUE)
queens_OFNS_DESC <- data.frame(queens_OFNS_DESC[queens_OFNS_DESC > 2000])
names(queens_OFNS_DESC) <- c("Category", "Frequency")
queens_OFNS_DESC$Percentage <- with(queens_OFNS_DESC, Frequency / sum(Frequency) * 100)
queens_OFNS_DESC
#queens
# Build the popups from `data_queens` itself. The map previously reused
# `data$popup`, which belongs to a different subset of rows (the first 20,000
# rows of `df`), so popup text did not correspond to the Queens markers.
data_queens$popup <- paste("<b>Incident #: </b>", data_queens$CMPLNT_NUM, "<br>", "<b>Category: </b>", data_queens$LAW_CAT_CD,
                           "<br>", "<b>Offence Description: </b>", data_queens$OFNS_DESC,
                           "<br>", "<b>Day of week: </b>", data_queens$day_of_week,
                           "<br>", "<b>Date: </b>", data_queens$CMPLNT_FR_DT,
                           "<br>", "<b>Time: </b>", data_queens$CMPLNT_FR_TM,
                           "<br>", "<b>PD Case: </b>", data_queens$PD_CD,
                           "<br>", "<b>PD Description: </b>", data_queens$PD_DESC,
                           "<br>", "<b>Longitude: </b>", data_queens$Longitude,
                           "<br>", "<b>Latitude: </b>", data_queens$Latitude)

# The layer control lists only the base groups that are actually added
# ("World StreetMap" was listed before but its layer is commented out); the
# duplicate ungrouped addTiles() call is dropped.
leaflet(data_queens, width = "100%") %>%
  addTiles(group = "OSM (default)") %>%
  #addProviderTiles(provider = "Esri.WorldStreetMap",group = "World StreetMap") %>%
  #addProviderTiles(provider = "Esri.WorldImagery",group = "World Imagery") %>%
  addProviderTiles(provider = "NASAGIBS.ViirsEarthAtNight2012", group = "Nighttime Imagery") %>%
  addMarkers(lng = ~Longitude, lat = ~Latitude, popup = ~popup, clusterOptions = markerClusterOptions()) %>%
  addLayersControl(
    baseGroups = c("OSM (default)", "Nighttime Imagery"),
    options = layersControlOptions(collapsed = FALSE)
  )
# Static map of the selected offense types in Queens.
# This chunk had been collapsed onto one line with typographic quotes, which R
# cannot parse; it is restored as separate statements with ASCII quotes.
# Bounding box (longitude/latitude) passed to the tile download.
Queen <- c(left = -74.1, bottom = 40.46, right = -73.60, top = 40.84)
map <- get_stamenmap(Queen, maptype = "toner-lite")
# Colour by offense description using the bare column name inside aes()
# rather than `Queens_map$OFNS_DESC`.
Queen_map <- ggmap(map) +
  geom_point(data = Queens_map, aes(x = Longitude, y = Latitude, color = factor(OFNS_DESC)), alpha = 1) +
  guides(colour = guide_legend(override.aes = list(alpha = 1, size = 5),
                               title = "Type of Crime")) +
  scale_colour_brewer(type = "qual", palette = "Paired") +
  ggtitle("Top Crimes in Queens") +
  theme_light(base_size = 15) +
  theme(axis.line = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank())
Queen_map
```
# Brooklyn subset of NYcrime_map, grouped by offense description.
Brooklyn_data <- NYcrime_map %>%
  filter(BORO_NM == "BROOKLYN") %>%
  group_by(OFNS_DESC)
Brooklyn_data
# Bounding box (longitude/latitude) passed to the tile download.
BROOKLYN <- c(left = -74.04, bottom = 40.56, right = -73.85, top = 40.742)
map <- get_stamenmap(BROOKLYN, maptype = "toner-lite")
## Map from URL : http://tile.stamen.com/toner-lite/10/301/384.png
## Map from URL : http://tile.stamen.com/toner-lite/10/301/385.png
# Colour by offense description using the bare column name inside aes();
# `Brooklyn_data$OFNS_DESC` there bypasses the layer data and triggers a
# ggplot2 warning.
BROOKLYN_Map <- ggmap(map) +
  geom_point(data = Brooklyn_data, aes(x = Longitude, y = Latitude, color = factor(OFNS_DESC)), alpha = 1.0) +
  guides(colour = guide_legend(override.aes = list(alpha = 1, size = 5),
                               title = "Type of Crime")) +
  scale_colour_brewer(type = "qual", palette = "Paired") +
  ggtitle("Top Crimes in Brooklyn") +
  theme_light(base_size = 10) +
  theme(axis.line = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank())
BROOKLYN_Map
# Bronx subset of NYcrime_map, grouped by offense description.
Bronx_data <- NYcrime_map %>%
  filter(BORO_NM == "BRONX") %>%
  group_by(OFNS_DESC)
Bronx_data
# Bounding box (longitude/latitude) passed to the tile download.
BRONX <- c(left = -73.96, bottom = 40.74, right = -73.69, top = 40.95)
map <- get_stamenmap(BRONX, maptype = "toner-lite")
## Map from URL : http://tile.stamen.com/toner-lite/10/302/384.png
# Colour by offense description using the bare column name inside aes();
# `Bronx_data$OFNS_DESC` there bypasses the layer data and triggers a ggplot2
# warning.
Bronx_Map <- ggmap(map) +
  geom_point(data = Bronx_data, aes(x = Longitude, y = Latitude, color = factor(OFNS_DESC)), alpha = 1.0) +
  guides(colour = guide_legend(override.aes = list(alpha = 1, size = 5),
                               title = "Type of Crime")) +
  scale_colour_brewer(type = "qual", palette = "Paired") +
  ggtitle("Top Crimes in BRONX") +
  theme_light(base_size = 10) +
  theme(axis.line = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank())
Bronx_Map
# Manhattan subset of NYcrime_map, grouped by offense description.
MANHATTAN_data <- NYcrime_map %>%
  filter(BORO_NM == "MANHATTAN") %>%
  group_by(OFNS_DESC)
MANHATTAN_data
# Bounding box (longitude/latitude) passed to the tile download.
MANHATTAN <- c(left = -74.09, bottom = 40.69, right = -73.83, top = 40.89)
map <- get_stamenmap(MANHATTAN, maptype = "toner-lite")
# Colour by offense description using the bare column name inside aes();
# `MANHATTAN_data$OFNS_DESC` there bypasses the layer data and triggers a
# ggplot2 warning.
MANHATTAN_Map <- ggmap(map) +
  geom_point(data = MANHATTAN_data, aes(x = Longitude, y = Latitude, color = factor(OFNS_DESC)), alpha = 1.0) +
  guides(colour = guide_legend(override.aes = list(alpha = 1, size = 5),
                               title = "Type of Crime")) +
  scale_colour_brewer(type = "qual", palette = "Paired") +
  ggtitle("Top Crimes in Manhattan") +
  theme_light(base_size = 10) +
  theme(axis.line = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank())
MANHATTAN_Map